library(dplyr)
library(readr)
histologies_df <- read_tsv("../../data/pbta-histologies.tsv")
Parsed with column specification:
cols(
  .default = col_character(),
  OS_days = col_double(),
  age_last_update_days = col_double(),
  normal_fraction = col_double(),
  tumor_fraction = col_double(),
  tumor_ploidy = col_double()
)
See spec(...) for full column specifications.
chordoma_samples <- histologies_df %>%
  filter(short_histology=="Chordoma") %>% 
  pull(Kids_First_Biospecimen_ID)
focal_cn_df <- read_tsv("../focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz")
Parsed with column specification:
cols(
  biospecimen_id = col_character(),
  status = col_character(),
  copy_number = col_double(),
  ploidy = col_double(),
  ensembl = col_character(),
  gene_symbol = col_character(),
  cytoband = col_character()
)
chordoma_loss <- focal_cn_df %>% 
  filter(biospecimen_id %in% chordoma_samples, gene_symbol=="SMARCB1")

chordoma_loss 
#we need to include the sample_id field from pbta-histologies.tsv in the final table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression values) and WGS data (e.g., SMARCB1 focal copy #number status) from the same event for a given individual).
#To get the SMARCB1 jitter plot in the photo here #250 (comment), you will first need to read in the #collapsed expression data

expression_data <- read_rds(file.path("..", "..", "data", "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
expression_data
chordoma_id_df <- histologies_df %>% 
  # only rows with chordoma samples
  filter(short_histology == "Chordoma") %>%
  # select only these columns that we'll need later
  select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
         experimental_strategy)
chordoma_id_df
copy_neutral_df <- chordoma_id_df %>% 
  # the copy events can only be taken from WGS data not RNA-seq data
  # we also only want biospecimens where a loss was not recorded to avoid duplicates
  filter(experimental_strategy == "WGS",
         !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
  # if there's no loss, let's assume status is copy neutral
  mutate(status = "neutral") %>%
  # let's get the columns to match chordoma_loss
  rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
  select(biospecimen_id, status)
copy_neutral_df
chordoma_copy <- chordoma_loss %>% 
  #join the losses with the neutrals to get a new data frame
  select(biospecimen_id, status) %>%
  bind_rows(copy_neutral_df)
chordoma_copy

Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy so we can match WGS and RNA-seq biospecimens from the same event/sample:

chordoma_copy <- chordoma_copy %>%
  # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame
  # then use biospecimen IDs to add the sample_id info
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = c("biospecimen_id" = "Kids_First_Biospecimen_ID"))
chordoma_copy

look at SMARCB1 expression values only in chordoma

# get the row that contains the SMARCB1 values
# gene symbols are rownames
smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ]

# now only the columns correspond to chordoma samples
smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ]
smarcb1_expression

The smarcb1_expression is a not a friendly form ^^; Transposing needed:

# transpose such that samples are rows
smarcb1_expression <- t(smarcb1_expression) %>%
  # make a data.frame
  as.data.frame() %>%
  # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  # give SMARCB1 column a slightly better column name
  rename(SMARCB1_expression = SMARCB1)
smarcb1_expression

This also needs sample_id to add it in:

smarcb1_expression <- smarcb1_expression %>%
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = "Kids_First_Biospecimen_ID")
smarcb1_expression
chordoma_smarcb1_df <- smarcb1_expression %>%
  # any missing samples will get filled with NA when using a full join
  full_join(chordoma_copy, by = "sample_id")
chordoma_smarcb1_df
# this step adds in the participant identifier (sample_id to match between the two data.frame)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  inner_join(distinct(select(chordoma_id_df, 
                             sample_id, 
                             Kids_First_Participant_ID)),
             by = "sample_id")

# combining the two biospecimen identifiers to a single column (all biospecimen IDs for a sampl separated by a comma)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  mutate(Kids_First_Biospecimen_ID = if_else(
    # one sample is missing WGS data, so if that's true only include biospecimen ID from RNA-seq
    is.na(biospecimen_id),
    Kids_First_Biospecimen_ID,
    paste(Kids_First_Biospecimen_ID, biospecimen_id, sep = ", ")
  ))
chordoma_smarcb1_df
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  select(Kids_First_Participant_ID, 
         Kids_First_Biospecimen_ID, 
         sample_id,
         status,
         SMARCB1_expression) %>%
  # 'status' is replaced a more descriptive name
  rename(focal_SMARCB1_status = status)
chordoma_smarcb1_df
library(ggplot2)
# this specifies that this is the data we want to plot
chordoma_smarcb1_df %>%
  # drop the sample that doesn't have WGS data
  tidyr::drop_na() %>%
  # this step specifies what should go on the x- and y-axes
  ggplot(aes(x = focal_SMARCB1_status,
             y = SMARCB1_expression)) +
  # we want a jitter plot where the points aren't too far 
  # apart that's what width does
  geom_jitter(width = 0.1) +
  # this is plotting the median as a blue diamond
  stat_summary(fun.y = "median", 
               geom = "point", 
               size = 3, 
               color = "blue", 
               shape = 18)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

---
title: "Subgtyping-chordoma"
output: html_notebook
author: Mateusz Koptyra 
date: 20191121
---

```{r}
library(dplyr)
library(readr)
```

```{r}
histologies_df <- read_tsv("../../data/pbta-histologies.tsv")
```
```{r}
chordoma_samples <- histologies_df %>%
  filter(short_histology=="Chordoma") %>% 
  pull(Kids_First_Biospecimen_ID)
```

```{r}
focal_cn_df <- read_tsv("../focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz")
```

```{r}
chordoma_loss <- focal_cn_df %>% 
  filter(biospecimen_id %in% chordoma_samples, gene_symbol=="SMARCB1")

chordoma_loss 
```

```{r}
#we need to include the sample_id field from pbta-histologies.tsv in the final table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression values) and WGS data (e.g., SMARCB1 focal copy #number status) from the same event for a given individual).
#To get the SMARCB1 jitter plot in the photo here #250 (comment), you will first need to read in the #collapsed expression data

expression_data <- read_rds(file.path("..", "..", "data", "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
expression_data
```

```{r}
chordoma_id_df <- histologies_df %>% 
  # only rows with chordoma samples
  filter(short_histology == "Chordoma") %>%
  # select only these columns that we'll need later
  select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
         experimental_strategy)
chordoma_id_df
```

```{r}
copy_neutral_df <- chordoma_id_df %>% 
  # the copy events can only be taken from WGS data not RNA-seq data
  # we also only want biospecimens where a loss was not recorded to avoid duplicates
  filter(experimental_strategy == "WGS",
         !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
  # if there's no loss, let's assume status is copy neutral
  mutate(status = "neutral") %>%
  # let's get the columns to match chordoma_loss
  rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
  select(biospecimen_id, status)
copy_neutral_df
```

```{r}
chordoma_copy <- chordoma_loss %>% 
  #join the losses with the neutrals to get a new data frame
  select(biospecimen_id, status) %>%
  bind_rows(copy_neutral_df)
chordoma_copy
```
Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy so we can match WGS and RNA-seq biospecimens from the same event/sample:
```{r}
chordoma_copy <- chordoma_copy %>%
  # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame
  # then use biospecimen IDs to add the sample_id info
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = c("biospecimen_id" = "Kids_First_Biospecimen_ID"))
chordoma_copy
```
look at SMARCB1 expression values only in chordoma

```{r}
# get the row that contains the SMARCB1 values
# gene symbols are rownames
smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ]

# now only the columns correspond to chordoma samples
smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ]
smarcb1_expression
```

The smarcb1_expression is a not a friendly form ^^; Transposing needed: 
```{r}
# transpose such that samples are rows
smarcb1_expression <- t(smarcb1_expression) %>%
  # make a data.frame
  as.data.frame() %>%
  # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  # give SMARCB1 column a slightly better column name
  rename(SMARCB1_expression = SMARCB1)
smarcb1_expression
```
This also needs sample_id to add it in:
```{r}
smarcb1_expression <- smarcb1_expression %>%
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = "Kids_First_Biospecimen_ID")
smarcb1_expression
```
```{r}
chordoma_smarcb1_df <- smarcb1_expression %>%
  # any missing samples will get filled with NA when using a full join
  full_join(chordoma_copy, by = "sample_id")
chordoma_smarcb1_df
```

```{r}
# this step adds in the participant identifier (sample_id to match between the two data.frame)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  inner_join(distinct(select(chordoma_id_df, 
                             sample_id, 
                             Kids_First_Participant_ID)),
             by = "sample_id")

# combining the two biospecimen identifiers to a single column (all biospecimen IDs for a sampl separated by a comma)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  mutate(Kids_First_Biospecimen_ID = if_else(
    # one sample is missing WGS data, so if that's true only include biospecimen ID from RNA-seq
    is.na(biospecimen_id),
    Kids_First_Biospecimen_ID,
    paste(Kids_First_Biospecimen_ID, biospecimen_id, sep = ", ")
  ))
chordoma_smarcb1_df
```

```{r}
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  select(Kids_First_Participant_ID, 
         Kids_First_Biospecimen_ID, 
         sample_id,
         status,
         SMARCB1_expression) %>%
  # 'status' is replaced a more descriptive name
  rename(focal_SMARCB1_status = status)
chordoma_smarcb1_df
```

```{r}
library(ggplot2)
```

```{r}
# this specifies that this is the data we want to plot
chordoma_smarcb1_df %>%
  # drop the sample that doesn't have WGS data
  tidyr::drop_na() %>%
  # this step specifies what should go on the x- and y-axes
  ggplot(aes(x = focal_SMARCB1_status,
             y = SMARCB1_expression)) +
  # we want a jitter plot where the points aren't too far 
  # apart that's what width does
  geom_jitter(width = 0.1) +
  # this is plotting the median as a blue diamond
  stat_summary(fun.y = "median", 
               geom = "point", 
               size = 3, 
               color = "blue", 
               shape = 18)
```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file). 

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

